The R packages needed for this entire WGCNA plus BioNERO script.
# basic
library(data.table)
library(tidyverse)
library(dplyr)
library(SummarizedExperiment)
# WGCNA
library(WGCNA)
library(flashClust)
library(BioNERO)
allowWGCNAThreads()
## Allowing multi-threading with up to 8 threads.
# heatmap color
library(colorspace) # I use
library(circlize) # Pablo use
# GO term
library(biomaRt)
Data preparation (small sample size)
Gene count
file is from normalized RUVseq result after regress out the top PC.
## data input
setwd("/Users/cl2375/OneDrive - Yale University/PhD study/Labs/Brennand Lab/RNA-seq/RNAseq_Pablo/WGCNA/Pablo_count")
gene_counts.df <- read.table(file = "normCounts_batch_corrected.txt", head = TRUE, sep = "\t", row.names = 1)
meta.df <- read.csv(file = "Pablo_meta.csv", head = TRUE, sep = ",", row.names = 1)
colnames(gene_counts.df) <- rownames(meta.df)
## data preprocessing
# replace missing value
exp_filt <- replace_na(gene_counts.df)
dim(exp_filt)
## [1] 16954 12
# remove non-expressed data
exp_filt <- remove_nonexp(exp_filt, method = "median", min_exp = 10)
dim(exp_filt)
## [1] 16077 12
# take top 10K variance genes
exp_filt <- filter_by_variance(exp_filt, n = 10000)
dim(exp_filt)
## [1] 10000 12
# filter outlier - could remove 1 outlier sample, but decide to not here since sample size is smaller
#exp_filt <- ZKfiltering(exp_filt, cor_method = "spearman")
#dim(exp_filt)
# adjust for PC
exp_filt <- PC_correction(exp_filt)
dim(exp_filt)
## [1] 10000 12
# set the final data used for following experiment
final_exp <- exp_filt
## explore data
p1 <- plot_heatmap(exp_filt, type = "samplecor", show_rownames = FALSE)
p1
p2 <- plot_PCA(exp_filt, metadata = meta.df[,2:3])
p2
p3 <- plot_heatmap(
exp_filt[1:50, ], type = "expr", show_rownames = FALSE, show_colnames = TRUE
)
p3
set power
| Power | SFT.R.sq | slope | truncated.R.sq | mean.k. | median.k. | max.k. |
|---|---|---|---|---|---|---|
| 9 | 0.7060 | -0.4660 | 0.893 | 193.0 | 161.0 | 496 |
| 10 | 0.7600 | -0.5150 | 0.892 | 169.0 | 137.0 | 452 |
| 11 | 0.8100 | -0.5590 | 0.916 | 150.0 | 119.0 | 414 |
| 12 | 0.8360 | -0.5980 | 0.907 | 135.0 | 104.0 | 382 |
| 13 | 0.8670 | -0.6290 | 0.913 | 121.0 | 91.8 | 353 |
BioNERO WGCNA
# bioNERO data preprocessing
final_exp <- exp_preprocess(
gene_counts.df, min_exp = 10, cor_method = 'spearman', variance_filter = TRUE, n = 10000)
## Number of removed samples: 1
# network module
#net <- exp2gcn(final_exp, net_type = "signed hybrid", SFTpower = 11, cor_method = "spearman", verbose = TRUE)
#saveRDS(net, file = "net_bioNERO_10kgene.rds")
net <- readRDS("net_bioNERO_10kgene.rds")
# Dendro and colors
plot_dendro_and_colors(net)
# Eigengene networks
plot_eigengene_network(net)
# number of genes per module
plot_ngenes_per_module(net)
## accessing module stability, cannot run with spearman
#module_stability(exp_filt, net, nRuns = 5)
## module-trait correlation
# traits
datTraits <- meta.df
datTraits$donor <- ifelse(datTraits$donor=="SCZ", 1, 0)
datTraits$treatment <- ifelse(datTraits$treatment=="TNF", 1, 0)
datTraits$group_HCP_NS <- ifelse(datTraits$group=="HCP_NS", 1, 0)
datTraits$group_HCP_TNF <- ifelse(datTraits$group=="HCP_TNF", 1, 0)
datTraits$group_SCZ_NS <- ifelse(datTraits$group=="SCZ_NS", 1, 0)
datTraits$group_SCZ_TNF <-ifelse(datTraits$group=="SCZ_TNF", 1, 0)
# set up se for easier run
nGenes = nrow(final_exp)
nSamples = 11
se <- SummarizedExperiment(assays = final_exp, colData = datTraits[-9,c(2:3,6:9)])
# calculate the p-value associated with the correlation
module.trait.correlation = cor(net$MEs, datTraits[-9,c(2:3,6:9)], use = "p", method = "spearman")
module.trait.Pvalue = corPvalueStudent(module.trait.correlation, nSamples)
# will display correlations and their p-values
textMatrix = paste(signif(module.trait.correlation, 2), "\n(",
signif(module.trait.Pvalue, 1), ")", sep = "");
dim(textMatrix) = dim(module.trait.correlation)
## plot
# color palette
ramp <- colorRamp2(seq(-2.5, 2.5, 0.1), hcl_palette = 'Blue-Red 3', reverse = FALSE)
colors <- attr(ramp, "colors")
# Display the correlation values within a heatmap plot
par(mar = c(5, 10, 3, 3))# good for R markdown
labeledHeatmap(Matrix = module.trait.correlation,
xLabels = names(datTraits[-9,c(2:3,6:9)]),
yLabels = names(net$MEs),
ySymbols = names(net$MEs),
colorLabels = FALSE,
colors = colors,
textMatrix = textMatrix,
setStdMargins = FALSE,
cex.text = 0.7,
cex.lab = 1,
zlim = c(-1,1),
main = paste("BioNERO Module-trait relationships"))
# Display the correlation values for the four group only
par(mar = c(5, 10, 3, 3))# good for R markdown
labeledHeatmap(Matrix = module.trait.correlation[,3:6],
xLabels = c("HCP_NS","HCP_TNF","SCZ_NS","SCZ_TNF"),
yLabels = names(net$MEs),
ySymbols = names(net$MEs),
colorLabels = FALSE,
colors = diverging_hcl(50, palette = "Blue-Red 3"),
textMatrix = textMatrix[,3:6],
setStdMargins = FALSE,
cex.text = 0.7,
cex.lab = 1,
zlim = c(-1,1),
main = paste("BioNERO Module-trait relationships"))
extract genes in modules
# read in annotation file
annotations.df <- read.csv(file="anno.csv", header=TRUE, sep=",", row.names = 1)
## hubgenes
## identify top hub gene
hubgene <- chooseTopHubInEachModule(
t(final_exp),
net$genes_and_modules["Modules"],
omitColors = "grey",
power = 11,
type = "signed hybrid")
# get hubgene symbol
hubgene_symbol <- annotations.df[annotations.df$ensembl %in% hubgene, c("ensembl","Gene_name")]
#write.csv(hubgene_symbol, "bioNERO_hubgene.csv", row.names = FALSE, quote = FALSE)
## identify multiple hub genes in a module
topHubs <- function (datExpr, colorh, omitColors = "grey", power = 2, type = "signed hybrid", ...)
{
# modified from chooseTopHubInEachModule, but return the table of all genes connectivity
isIndex = FALSE
modules = names(table(colorh))
if (!is.na(omitColors)[1])
modules = modules[!is.element(modules, omitColors)]
if (is.null(colnames(datExpr))) {
colnames(datExpr) = 1:dim(datExpr)[2]
isIndex = TRUE
}
connectivity_table <- data.frame(matrix(ncol = 3)) %>% setNames(c('gene', 'connectivity_rowSums_adj', 'module'))
hubs = rep(NA, length(modules))
names(hubs) = modules
for (m in modules) {
adj = adjacency(datExpr[, colorh == m], power = power, type = type, ...)
hub = which.max(rowSums(adj))
hubs[m] = colnames(adj)[hub]
sorted_genes <- rowSums(adj) %>% sort(decreasing = T) %>% as.data.frame() %>%
tibble::rownames_to_column() %>% setNames(c('gene', 'connectivity_rowSums_adj')) %>% mutate(module = m)
connectivity_table <- connectivity_table %>% rbind(sorted_genes)
}
if (isIndex) {
hubs = as.numeric(hubs)
names(hubs) = modules
}
return(connectivity_table %>% na.omit)
}
# get top 10 hub genes per module
hubgenes <- topHubs(t(final_exp), net$genes_and_modules["Modules"], omitColors = "grey", power = 11, type = "signed hybrid") %>% group_by(module) %>% top_n( 10, wt = connectivity_rowSums_adj)
# get the gene symbol of the hub genes
hubgenes_list <- annotations.df[annotations.df$ensembl %in% hubgenes$gene, c("ensembl","Gene_name")]
hubgenes_list <- merge(hubgenes_list, hubgenes, by.x = "ensembl", by.y = "gene", all.x = TRUE)
#write.csv(hubgenes_list, "bioNERO_hubgene_list.csv", row.names = FALSE, quote = FALSE)
## get gene list
# a list of color
color_ME <- colnames(net$MEs)
color <- sub("^ME", "", color_ME)
# function
extract_genes_by_color <- function(color, net, annotations.df) {
color_genes <- net$genes_and_modules %>%
filter(Modules == color) %>%
dplyr::select(Genes)
color_genes_info <- annotations.df %>%
filter(ensembl %in% color_genes$Genes) %>%
dplyr::select(ensembl, Gene_name)
write.csv(color_genes_info, paste0(color, "_genes.csv"), row.names = FALSE, quote = FALSE)
}
# run color
for (i in color){
extract_genes_by_color(i, net, annotations.df)
}
#extract_genes_by_color("mediumpurple1", net, annotations.df)
GO term enrichment
library(clusterProfiler)
## Registered S3 method overwritten by 'ggtree':
## method from
## fortify.igraph ggnetwork
## clusterProfiler v4.10.1 For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
##
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:biomaRt':
##
## select
## The following object is masked from 'package:IRanges':
##
## slice
## The following object is masked from 'package:S4Vectors':
##
## rename
## The following object is masked from 'package:purrr':
##
## simplify
## The following object is masked from 'package:stats':
##
## filter
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
library(AnnotationDbi)
# get module genes
setwd("/Users/cl2375/OneDrive - Yale University/PhD study/Labs/Brennand Lab/RNA-seq/RNAseq_Pablo/WGCNA/Pablo_count/ModuleCal_v3/bioNERO_ModuleGene_v2")
GO_list <- read.csv(file = "brown_genes.csv", head = TRUE)
# GO term enrichment
GO_results <- enrichGO(gene = GO_list[,1], OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP", pvalueCutoff = 0.05,
pAdjustMethod = "BH", rownames(exp_filt))
# reduce GO term redundancy
GO_results_sim <- simplify(GO_results)
# plot out top GO
plot(barplot(GO_results, showCategory = 20))
# plot out reduced redundancy top GO
plot(barplot(GO_results_sim, showCategory = 20))
No significant GO terms found for antiquewhite1_genes No significant GO
terms found for bisque4_genes No significant GO terms found for
blue_genes No significant GO terms found for blue3_genes No significant
GO terms found for coral1_genes No significant GO terms found for
coral4_genes No significant GO terms found for darkorange2_genes No
significant GO terms found for darkred_genes No significant GO terms
found for darkseagreen2_genes No significant GO terms found for
darkslateblue_genes No significant GO terms found for green_genes No
significant GO terms found for green4_genes No significant GO terms
found for honeydew_genes No significant GO terms found for
honeydew1_genes No significant GO terms found for indianred4_genes No
significant GO terms found for lightpink2_genes No significant GO terms
found for lightsteelblue1_genes No significant GO terms found for
magenta_genes No significant GO terms found for navajowhite1_genes No
significant GO terms found for orangered4_genes No significant GO terms
found for palevioletred_genes No significant GO terms found for
palevioletred1_genes No significant GO terms found for tan4_genes No
significant GO terms found for tomato_genes